1 Introduction

1.1 RMarkdown set-up

First, basic parameters are set up in this RMarkdown, such as loading dependencies, setting paths and setting up a uniform plot structure.

Most plots in this RMarkdown file are made interactive with ggplot. PDF versions of most plots are in this GitHub repository under ./plots/

knitr::opts_chunk$set(warning=FALSE, message=FALSE)
require(tidyverse)
require(biomaRt)
require(reshape2)
require(broom)
require(RColorBrewer)
require(scales)
require(ggrepel)
require(ggbeeswarm)
require(gridExtra)
require(ggExtra)
require(ggplot2)
require(RCurl)
require(plotly)
require(nnls)
require(ggpubr)
require(DT)

source("../../Resources/HelperFunctions.R")

# plot style
theme_point<-theme_bw()+theme(strip.background = element_blank())

theme_bar<-theme_bw()+theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),strip.background = element_blank(),axis.ticks.x = element_blank(),axis.line.x = element_blank())

theme_boxplot<-theme_bw()+theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),strip.background = element_blank(),axis.ticks.x = element_blank(),axis.title.x = element_blank(),axis.line.x = element_blank(),legend.position = "none")

color_panel<-c("#e35d6a","#5bb75b","#428bca","#e87810","#23496b","#ffbf00","#cc2028","#039748","pink","gray","darkgray")

color_panel1 <-  c("#039748","#039748","#ffbf00","#ffbf00","#e35d6a","#e35d6a","#5bb75b","#5bb75b","#428bca","#428bca","#23496b","#23496b","#cc2028","#cc2028","#e87810")

color_panel2 <-  brewer.pal(10, name = "Paired")
names(color_panel2) <- c("serum","Biomatrica","EDTA separator","EDTA","Citrate","RNA Streck","Roche","DNA Streck", "PAXgene", "ACD-A")
full_nr <- format_format(big.mark = ",", decimal.mark = ".", scientific = FALSE)

options(scipen=10000)

# Global variables
data_path <- "../data_clumpify_pipeline/data_RNaseH_Seq_SK_Biogazelle_Exomes/"
sample_annotation<-read_csv("./Annotation_exRNAQC005.csv")

sample_annotation <- sample_annotation %>% filter(RunID == "RNaseH_Seq_SK_Biogazelle_Exomes")
sample_annotation<-sample_annotation %>% mutate(SampleID= paste0(GraphTube,"_",BiologicalReplicate, "_",TimeLapse))
sample_annotation<-sample_annotation %>% mutate(PlasmaInputVml= PlasmaInputV/1000)
sample_annotation$TubeType <- ifelse(sample_annotation$Tube %in% c("DNA Streck", "Biomatrica", "RNA Streck", "PAXgene", "Roche"), "Preservation", "Non-preservation")
sample_annotation$GraphTube <- paste0(sample_annotation$GraphTube, "_", sample_annotation$TimeLapse)
# fold change function + make plots
generateFC <- function(input_df, variable, dfName, AC = FALSE){
fold_change_input_all <- data.frame() 
for (duplicate_type in unique(sample_annotation$Tube)){
  sample_duplicates<-sample_annotation%>% filter(Tube==duplicate_type)
  if(nrow(sample_duplicates)>1){
    #print(duplicate_type)
    #double_positives_sample <-double_positives %>% dplyr::select(MIMATID,sample_duplicates$UniqueID) 
    input_df_sample <- left_join(sample_duplicates, input_df)
    # only keep the replicates of one type
   
    samples_comb <- combn(sample_duplicates$UniqueID,2) #compare 2 of the 3 samples at a time
    for (n_col in 1:ncol(samples_comb)) {
      #print(samples_comb[,n_col])
      nr_runA <- gsub("^[A-Z]+","",sample_annotation[sample_annotation$UniqueID== samples_comb[1,n_col],]$TimeLapse)
      nr_runB <- gsub("^[A-Z]+","",sample_annotation[sample_annotation$UniqueID== samples_comb[2,n_col],]$TimeLapse)
      RepA <- sample_annotation[sample_annotation$UniqueID== samples_comb[1,n_col],]$BiologicalReplicate
      RepB <- sample_annotation[sample_annotation$UniqueID== samples_comb[2,n_col],]$BiologicalReplicate
     if ((RepA == RepB)){
      varname <- paste0("T",nr_runA,"_",nr_runB) #make a name so you can backtrace which replicates are compared
      #print(varname)
      compareVar <- input_df %>% filter(UniqueID == paste(samples_comb[1,n_col]) | UniqueID == paste0(samples_comb[2,n_col])) 
      if (AC == TRUE){
      plot = "absolute change"
      intercept = 0
      if (is.na((abs(compareVar[[variable]][1]) > abs(compareVar[[variable]][2])))){
        correlation_samples<- compareVar %>%
        mutate(foldchange=NA)
      }
      else if (abs(compareVar[[variable]][1]) >= abs(compareVar[[variable]][2])){
        correlation_samples<- compareVar %>%
        mutate(foldchange= abs(compareVar[[variable]][1])-abs(compareVar[[variable]][2]))
      } else {
        correlation_samples<- compareVar %>%
        mutate(foldchange= abs(compareVar[[variable]][2])-abs(compareVar[[variable]][1]))
      }}
      else if (AC == FALSE){
      plot = "fold change"
      intercept = 1
      if (is.na((abs(compareVar[[variable]][1]) > abs(compareVar[[variable]][2])))){
        correlation_samples<- compareVar %>%
        mutate(foldchange=NA)
      }
      else if (abs(compareVar[[variable]][1]) >= abs(compareVar[[variable]][2])){
        correlation_samples<- compareVar %>%
        mutate(foldchange= abs(compareVar[[variable]][1])/abs(compareVar[[variable]][2]))
      } else {
        correlation_samples<- compareVar %>%
        mutate(foldchange= abs(compareVar[[variable]][2])/abs(compareVar[[variable]][1]))
      }}
      
      FC_input<-correlation_samples %>% 
        mutate(Tube=duplicate_type, Replicates=varname, BiologicalReplicate = paste0(RepA,"-",RepB)) %>%
        mutate(ReplID = paste0(Tube,"-",Replicates))
      FC_input <- FC_input %>% mutate(inputType = paste0(dfName))
      if (nrow(FC_input) == 2){
          fold_change_input_all <- rbind(fold_change_input_all, as.data.frame(FC_input))
        }
      }
    }
  }
}
FC <- fold_change_input_all %>% dplyr::group_by(Tube,Replicates,BiologicalReplicate) %>%
  mutate(ReplID = paste0(Tube,"-",Replicates)) %>% select(c(ReplID, foldchange, Replicates, inputType)) %>% distinct(ReplID, .keep_all = TRUE)

FC$TimePoint <- ifelse(FC$Replicates %in% c("T24_0", "T0_24", "T0_04", "T04_0"), "1st timepoint", ifelse(FC$Replicates %in% c("T72_0", "T0_72", "T0_16", "T16_0"), "2nd timepoint", NA))
FC$TubeType <- ifelse(FC$Tube %in% c("DNA Streck", "Biomatrica", "RNA Streck", "PAXgene", "Roche"), "preservation", "non-preservation")
                       
print(ggplot(FC %>% filter(!is.na(TimePoint)), aes(x=reorder(Tube,foldchange, FUN = function(x) -mean(x, na.rm = TRUE)), y = foldchange)) + 
          geom_boxplot() + 
          geom_beeswarm(groupOnX=TRUE, size = 2.5, aes(col = TimePoint, shape = TubeType)) + 
          geom_hline(yintercept = intercept, linetype = "dashed", color = "gray") + 
          labs(subtitle = "ordered by mean (white triangle)", title = paste0(plot, " of ", dfName), y = paste0(plot), x = NULL, col = "comparison", shape = "tube type") +
          scale_fill_manual(values=color_panel) + 
          theme_point +theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1)) +
          stat_summary(
          geom = "point",
          fun.y = "mean",
          col = "black",
          size = 2,
          shape = 24,
          fill = "white"
        )
      )
return(FC)
}

1.2 Experimental setup

For the evaluation of the different blood collection tubes, blood was drawn from 9 healthy volunteers (3 volunteers for the preservation tube, 3 volunteers for the non-preservation plasma tubes and 3 volunteers for the non-preservation serum tubes). The order of all blood tubes was randomized per donor. All blood tubes were filled to the volume recommended by the manufacturer. We refer to the manuscript and supplemental files for more information about experimental setup.

Preservation tubes. We included 5 different preservation tubes (Streck cell-free RNA BCT, Streck cell-free DNA BCT, PAXgene Blood ccfDNA Tube, Roche cell-free DNA Collection tube and Biomatrica LBgard blood tube) with processing at 3 different timepoints after blood collection (immediately (T0), after 24 hours (T24) and after 72 hours (T72)). Thus, 15 blood tubes were drawn per healthy volunteer.

Non-preservation plasma tubes. We included 4 different non-preservation plasma tubes (BD Vacutainer K2E EDTA spray, Vacuette EDTA gel, BD Vacutainer ACD-A and Vacuette Sodium citrate 3.2%) with processing at 3 different timepoints after blood collection (immediately (T0), after 4 hours (T4) and after 16 hours (T16)). Thus, 12 blood tubes were drawn per healthy volunteer.

Non-preservation serum tube. We included 1 non-preservation serum tube (BD Vacutainer SST II Advance) with processing at 3 different timepoints after blood collection (immediately (T0), after 4 hours (T4) and after 16 hours (T16)). Thus, 3 blood tubes were drawn per healthy volunteer. All tubes were collected from one antecubital vein with the BD Vacutainer push button 21G with 12” tube butterfly needle system.

When 15 tubes were collected, 7 tubes were collected from one antecubital vein and 8 tubes were collected from the contralateral antecubital vein with the BD Vacutainer push button 21G, 7” tube with pre-attached holder tube butterfly needle system. When 12 tubes were collected, 6 tubes were collected from one antecubital vein and 6 tubes were collected from the contralateral antecubital vein with the BD Vacutainer push button 21G with 12” tube butterfly needle system.

1.3 Metric selection

Five performance metrics were evaluated. In order to select the tubes that are most stable across time points, we calculated for every tube and donor the fold change across different timepoints (excluding T24-72 and T04-16). Thus, six fold-change values were obtained per tube. These were visualized and tubes were ordered based on the mean of these six values.

  1. Hemolysis (see Fold changes for hemolysis)
  2. Relative RNA concentration, based on endogenous reads vs Sequin spike-in RNA reads (see Fold changes of relative RNA concentration in plasma)
  3. Absolute number of genes detected (after setting a count cut-off of 6 counts, based on the kit study (exRNAQC004), see Fold changes of number of genes)
  4. Percentage of counts mapping on protein coding genes (see Fold changes of protein coding gene count fraction)
  5. Area left of the curve (ALC) calculation (a precision metric) between different timepoints (see Area left of the curve (ALC))

An overview of the mean fold changes is presented at the end of this analysis (see Fold change summary of 5 performance metrics)

An example of the “fold change” plot is depicted below.

values <- c(10, 50, 100, 5, 15, 30)
tube <- c("tube A", "tube A", "tube A", "tube B", "tube B", "tube B")
donor <- c("donor1", "donor1", "donor1", "donor1", "donor1", "donor1")
time <- c("T0", "T24", "T72", "T0", "T24", "T72")
FC <- c(5, 10, NA, 3, NA, 6)
comparison <- c("0h vs 24h", "0h vs 72h", NA, "0h vs 24h", NA, "0h vs 72h")

df_raw <- data.frame(values = values, Tube = tube, BiologicalReplicate = donor, TimeLapse = time, foldchange = FC, Comparison = comparison)

p1 <- ggplot(df_raw,aes(x=TimeLapse,y=as.numeric(`values`), col=BiologicalReplicate, group = BiologicalReplicate))+
  geom_point()+geom_line()+
  theme_point+
  facet_wrap(~Tube, ncol = 5, scales="free_x") +
  theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
  scale_y_continuous(limits = c(0, NA), breaks=values)+
  scale_color_manual(values=color_panel) +
  theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) + labs(title="plot of evolution over time - example", y = "example value", x = "timepoint", col = NULL)

p2 <- ggplot(df_raw, aes(x=reorder(Tube,foldchange, FUN = function(x) -mean(x, na.rm = TRUE)), y = foldchange)) + 
  geom_boxplot() + geom_beeswarm(groupOnX=TRUE, aes(col = Comparison))  + 
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray") + 
  labs(subtitle = "ordered by mean", title = paste0("fold change plot - example"), y = "fold change", x = "tube", col = NULL) +
  scale_fill_manual(values=color_panel) + theme_point+
  scale_y_continuous(breaks=c(5, 10, 3, 6, 1))+
  stat_summary(
    geom = "point",
    fun.y = "mean",
    col = "black",
    size = 3,
    shape = 24,
    fill = "white"
  )

figure <- ggarrange(
  p1, p2, labels = c("A", "B"),
  common.legend = FALSE, legend = "bottom", ncol = 2
)

figure

2 Sample annotation

An overview of the samples included in this experiment.

3 Platelet counts

We measured platelet counts for all tubes and in all fractions obtained during centrifugation (whole blood, 1st centrifugation step (termed “PRP”), 2nd centrifugation step (termed “PPP”) and the end fraction = plasma after the final centrifugation step (termed “PFP”)). Ideally, the platelet count of the “end fraction” should be very close to zero. In the non-preservation tubes, we can see that the platelet counts stay stable between WB and the 1st centrifugation step. Furthermore, there is no big impact of time delay. However, in the preservation tubes, we can see platelet count rising significantly between WB and the 1st centrifugation step and there are more pronounced differences in platelet count for the PAXgene tube between T24 and T72.

The data can be further explored in the interactive plots. All data was rescaled to whole blood at 100%.

3.1 Overview - standard SOP

This overview only contains the number of platelets in the “end fraction” (relative to whole blood), because we only used this in the RNA-seq experiment. The other plots give the evolution in the different fractions.

3.2 NP - Time 0h

3.2.1 By tube

3.2.2 By donor

3.3 P - Time 0h

3.3.1 By tube

3.3.2 By donor

3.4 Time 4h

3.4.1 By tube

3.4.2 By donor

3.5 Time 16h

3.5.1 By tube

3.5.2 By donor

3.6 Time 24h

3.6.1 By tube

3.6.2 By donor

3.7 Time 72h

3.7.1 By tube

3.7.2 By donor

4 Hemolysis

Hemolysis was measured with Nanodrop (absorbance at 414 nm) in plasma across all tubes. Overall, we can see that there is less hemolysis in non-preservation tubes and there is more hemolysis in preservation tubes and at later timepoints. Hemolysis is also our first metric. Furthermore, Spearman rank correlation between replicate platelet measurements is high (R = 0.82).

4.1 Hemolysis in plasma

5 Sequencing and preprocessing

5.1 Run metrics

The samples were sequenced on 2019-03-16 on a NovaSeq 6000, S2 flow cell, 2x75 cycles, Illumina. Basic metrics:

  • %PF: 71.39%
  • % ≥Q30: 94.72%
  • Yield: 674.50 Gbp

The raw data is available at: RNAseH_Seq_SK_Biogazelle_Exomes

Some samples did not receive many reads compared to the others. This seems to be related to the tube type (DNA Streck, RNA streck and one Biomatrica tube). Four out of seven samples with read % < 0.5 are from donor 2. After library construction, it was observed that some samples had a low concentration based on the Fragment Analyser run and Kapa qPCR. Therefore, during equimolar pooling, a lower amount than the required amount (to reach equimolarity) had to be added. We observed that the samples that did not reach a high enough concentration are the same samples that have a low number of reads going to them. Hover on points to see sample IDs.

Indexing QC

5.2 Pipeline parameters and MultiQC reports

MultiQC reports of the fastq files can be found in this GitHub repository under logs/20200116.RNaseH_Seq_SK_Biogazelle_Exomes/fastq_only_multiqc.html

MultiQC reports of the pipeline can be found in this GitHub repository under logs/20200116.RNaseH_Seq_SK_Biogazelle_Exomes/pipeline_multiqc_report.html

The script of the pipeline to see the parameters can be found in this GitHub repository under logs/20200116.RNaseH_Seq_SK_Biogazelle_Exomes/20191126_ClumpifyPipelineStrandedness.py and logs/20200116.RNaseH_Seq_SK_Biogazelle_Exomes/runPipelineStrandedness.sh.

5.3 Number of reads in different stages of the pipeline

All reads were subsampled to 3.6 M reads so that there is a fair comparison between all tubes (e.g. if one sample is sequenced deeper it will probably yield more genes compared to a sample that was sequenced less deep).

The reads were counted on the fastq level and plotted in the following plots. We used these commands to count the reads in the fastq files:

for sample in $(ls | grep "RNA0"); do lines=4; qcfil_lines=`zcat ${sample}/${sample}_1.fastq.gz | wc -l`; echo $((qcfil_lines/lines)) $(echo $sample | cut -d'-' -f1); done >> lines_fastq_pre.txt

for sample in $(ls | grep "RNA0"); do lines=4; qcfil_lines=`zcat ${sample}/dedup_clumpify-subs_atstart/${sample}_1_qc_subs.fastq.gz | wc -l`; echo $((qcfil_lines/lines)) $(echo $sample | cut -d'-' -f1); done >> lines_fastq_subs.txt

for sample in $(ls | grep "RNA0"); do lines=4; qcfil_lines=`zcat ${sample}/dedup_clumpify-subs_atstart/${sample}_1_qc_subs_clumped.fastq.gz | wc -l`; echo $((qcfil_lines/lines)) $(echo $sample | cut -d'-' -f1); done >> lines_fastq_post.txt
pd <- position_dodge(0.2)
reads_pre = read.table(paste0(data_path, "lines_fastq_pre.txt"),header=FALSE, col.names = c("pre", "UniqueID")) 
reads_subs = read.table(paste0(data_path, "lines_fastq_subs.txt"),header=FALSE, col.names = c("subs", "UniqueID")) 
reads_post = read.table(paste0(data_path, "lines_fastq_post.txt"),header=FALSE, col.names = c("post", "UniqueID")) 

reads <- full_join(reads_pre,reads_subs,by=c("UniqueID"))
reads <- full_join(reads,reads_post,by=c("UniqueID"))
reads_melt <- reads %>% melt(value.name = "reads")
reads$duplicates <- round(1 - reads$post/reads$subs,4)

reads <- reads %>% select(UniqueID, duplicates, post, subs, pre)

duplicates <- full_join(reads, sample_annotation, by = c("UniqueID"))
reads_melt <- full_join(reads_melt, sample_annotation, by = c("UniqueID"))

pd <- position_dodge(0.2)

# ggplot(reads_melt,aes(x=reorder(variable, desc(variable)),y=reads, group = PlasmaID, color = Tube))+
#      geom_line(position = pd)+
#     geom_point(position = pd) +
#     facet_wrap(~RunID, ncol = 3)+scale_y_continuous(trans = "log10") +
#     geom_hline(yintercept = 3000000, linetype = "dashed", color = "gray")+
#     labs(title="Reads per sequencing run", y = "# reads", x = "stage in pipeline") +
#     theme_bar+ theme(axis.text.x=element_text(angle=90, hjust=1)) +
#     scale_x_discrete(labels = c("raw read count","read count after subsampling", "reads count after pipeline"), limits=c("pre", "subs", "post"))+
#     scale_color_manual(values=color_panel)

reads_overview <- duplicates %>% select("UniqueID", "SampleID", "pre", "subs", "post", "duplicates")

# Reads should be subsampled to: reads_melt %>% filter(variable == "pre") %>% select(reads) %>% min()

5.5 Duplicate rate

As Clumpify (version bundled with BBMap version 38.28) does not output a log file with duplicate stats, we estimated duplicate reads by dividing the number of reads after Clumpify by the number of reads after subsampling (as this is the only step that removes reads, so all removed reads are duplicates). As expected because of the low RNA input, duplicate percentages range from 85-99%.

5.7 Quality control metrics - table

5.8 Reading count data

Our pipeline was written to count reads using Kallisto, based on Ensembl v91. In this step, we make the raw count dataframes, as well as the counts per million (cpm) and transcripts per kilobase million (tpm) dataframes. The ERCC/Sequin spikes are filtered out, we group transcripts per gene and then sum the counts or tpm.

ensembl <- useEnsembl(biomart="ensembl",dataset="hsapiens_gene_ensembl", version = 91)
genes_ens <- getBM(attributes=c('ensembl_gene_id','ensembl_transcript_id'),mart=ensembl)
genes_ens<-rbind(genes_ens,c("rDNA45S","gi|555853|gb|U13369.1|HSU13369"))

if (!file.exists("./kallisto_counts.tsv") & !file.exists("./kallisto_tpm.tsv")){
  kallisto<-as.data.frame(genes_ens)
  kallisto_tpm<-as.data.frame(genes_ens)
  files<-list.files(data_path,recursive=TRUE)
  files<-files[grep("abundance.tsv", files)]
  
  for(i in 1:length(files)){
    tmp<-data.table::fread(paste0(data_path,files[i]), header=T, sep="\t", data.table=FALSE)
    name_sample<-gsub(".*/","",sub("-[0-9]*/dedup_clumpify-subs_atstart/[0-9]*_klout","",sub("/abundance.tsv","",files[i])))
    tmp1<-tmp[,c("target_id","est_counts")]
    tmp2<-tmp[,c("target_id","tpm")]
    colnames(tmp1)<-c("target_id",name_sample)
    colnames(tmp2)<-c("target_id",name_sample)
    kallisto<-full_join(kallisto,tmp1,by=c("ensembl_transcript_id"="target_id"))
    kallisto_tpm<-full_join(kallisto_tpm,tmp2,by=c("ensembl_transcript_id"="target_id"))
  }
  write_tsv(kallisto, "./kallisto_counts.tsv")
  write_tsv(kallisto_tpm, "./kallisto_tpm.tsv")
} else {
  kallisto <- read_tsv("./kallisto_counts.tsv")
  kallisto_tpm <- read_tsv("./kallisto_tpm.tsv")
}

kallisto <- as.tibble(kallisto)
kallisto_tpm <- as.tibble(kallisto_tpm)

kallisto <- kallisto[rowSums(kallisto %>% select_if(is.numeric),na.rm=TRUE)>0,] 
kallisto_tpm <- kallisto_tpm[rowSums(kallisto_tpm %>% select_if(is.numeric),na.rm=TRUE)>0,]

kallisto$ensembl_gene_id <- ifelse(grepl("ERCC", kallisto$ensembl_transcript_id), "ERCC", 
                                   ifelse(grepl("R1|R2", kallisto$ensembl_transcript_id), "Sequin",
                                   kallisto$ensembl_gene_id))

kallisto_tpm$ensembl_gene_id <- ifelse(grepl("ERCC", kallisto_tpm$ensembl_transcript_id), "ERCC", 
                                   ifelse(grepl("R1|R2", kallisto_tpm$ensembl_transcript_id), "Sequin",
                                   kallisto_tpm$ensembl_gene_id))

kallisto_gene_level <- kallisto %>% filter(!str_detect(ensembl_transcript_id,"Spike")) %>% group_by(ensembl_gene_id) %>% summarise_if(is.numeric, sum, na.rm=TRUE) 

kallisto_genes <- kallisto %>% filter(!str_detect(ensembl_transcript_id,"ERCC|R1|R2|Spike")) %>% group_by(ensembl_gene_id) %>% summarise_if(is.numeric, sum, na.rm=TRUE) #filter out spikes and group counts by gene_id (instead of transcript_id)

kallisto_genes_tpm <- kallisto_tpm %>% filter(!str_detect(ensembl_transcript_id,"ERCC|R1|R2|Spike")) %>% group_by(ensembl_gene_id) %>% summarise_if(is.numeric, sum, na.rm=TRUE)

kallisto_cpm <- as.data.frame(apply(kallisto %>% select_if(is.numeric),2,function(x) {10^6*x/sum(x,na.rm=TRUE)}))
kallisto_cpm$ensembl_transcript_id <- kallisto$ensembl_transcript_id
kallisto_cpm$ensembl_gene_id <- kallisto$ensembl_gene_id

kallisto_genes_cpm<-kallisto_cpm %>% filter(!str_detect(ensembl_transcript_id,"ERCC|R1|R2|Spike")) %>% group_by(ensembl_gene_id) %>% summarise_if(is.numeric, sum, na.rm=TRUE)

6 ERCC and Sequin RNA spikes

In our samples, we add two types of spikes:

  1. Sequin spikes were added to the plasma lysate (during RNA isolation) and are thus indicative of RNA extraction efficiency. The number of reads going to Sequin spikes is inversely correlated with the amount of endogenous RNA.
  2. ERCC spikes were added to the RNA eluate (after RNA isolation) and are thus indicative of RNA yield (if corrected for the elution volume, but in this experiment the elution volume is identical accross all samples). Similarly, more ERCC spikes indicates less endogenous RNA in eluate.

The original spike-in concentrations are described in annotation files coming from the providers.

6.1 Total amount of spikes

We can look at the absolute number of spike counts. However, this is not that informative, it is better to look at ratio of endogenous RNA reads to spike reads (see Endogenous reads vs spike reads).

6.2 ERCC

  • **ERCC spikes are added to RNA eluate*

6.2.1 ERCC TPM

  • These plots are based on TPM values as this corrects for length of the spike.
  • Each facet focuses on different spike: for this particular spike the TPM in every sample is shown (triplicates are directly next to each other)
    • Facets (spikes) ordered from high to low according to mean TPM of respective spike
    • Darker colour corresponds to higher concentration in the original mix (according to manufacturer, so the mix that was added to the eluate)
    • Small discrepancy between concentration order and TPM order, but in general good correspondance.
  • In the top plots: similar pattern in terms of TPM of the respective spike
    • the replicates are next to each other in x-axis and their ERCC TPM appears to be similar (reproducible pattern per triplicate)
    • Here, we actually see two parameters reflected: amount of endogous RNA (from higher input and/or better extraction -> more endogenous -> less ERCC) & library prep variability
  • DNA Streck and RNA Streck are outliers in these plots, indicating that there are more spike reads and fewer endogenous reads in these tubes.

TPM per spike (different plots) and per sample (x-axis)

6.2.2 Linear models

  • Linear model for each sample based on the expression of the spikes (because the spikes were added in a known concentration).
    • Linear models based on the 20 highest spikes (according to rowmeans) because these spikes are picked up in (almost) every sample
  • Plot shows that there is indeed a good fit.

6.3 Sequin

  • Sequin spikes are added during RNA isolation.
  • Sequin input volume was proportional to plasma input volume (2 μl Sequin per 200 μl plasma). Thus, the ratio of Sequin spikes to endogenous RNA is the same at the start of the experiment.

6.3.1 Sequin TPM

These plots are based on TPM values as this corrects for length of the spike. The ratio Sequin to endogenous RNA is the same in every tube in start of experiment + all sequencing data subsampled to 3.6 M. Thus, we expect to have an equal count of Sequins in every tube if all tubes perform equally. The differences that we see here are not related to differences in concentrations at the start

  • What causes remaining variability?
    • different tubes results in endogenous RNA differences
    • extraction efficiency (less efficient extraction results in less Sequins (and endogenous RNA) in eluate)
    • PCR duplicate removal (more reads removed in some samples)
  • Each facet on the image focuses on different spike: for that spike the TPM in every sample is shown (triplicates are directly next to each other)
    • Facets on the figures (= spikes) are ordered from high to low according to mean TPM of respective spike
      • Darker colour corresponds to higher concentration in the original mix (according to manufacturer)
      • Again, some discrepancy between concentration order and TPM order
    • Even for the spikes with relatively high concentration in the mix, the spike is not detected in some samples
      • Some spikes seem to drop out very quickly

6.3.2 Linear models

  • Linear model for each sample based on the expression of the spikes (because the spikes were added in a known concentration).
    • Linear models based on the 20 highest spikes (according to rowmeans) because these spikes are picked up in (almost) every sample
  • Plot shows that in general, the higher the concentration in the mix, the higher the TPM, but the R^2 values are a lot lower than for ERCCs (similar to exRNAQC004).

6.4 Endogenous reads vs spike reads

6.4.1 Endogenous vs Sequins

The volume of Sequin spikes that is added is proportional to plasma input volume (1 microL Sequin/ 100 microL plasma)

  • If the RNA-isolation does not selectively favour Sequins or endogenous RNA, the ratio Sequin vs endogenous RNA should be constant throughout experiment.
  • In absence of length bias, differences in Sequin/endogenous are related to tube content, experimental error (not exact input volume) or bias towards RNA content.
ggsave("./plots/SeqVsEndo.pdf", plot = ggplot2::last_plot(), dpi = 300)

# Plot original ratio Sequin/endo
spikes1 <- ggplot(gene_level_ratios, aes(x=GraphTube, y=EndovsSeq, fill=Tube, colour=Tube)) +
  #geom_bar(stat="identity") +
  geom_point(size=1.2) +
  labs(x="", y="endogenous/sequin") +
  scale_fill_manual(values=color_panel) +
  scale_colour_manual(values=color_panel) +
  theme_bar +
  labs(x="", y="endogenous/sequin", title="endogenous RNA vs Sequin counts")
#calculate statistics for Sequin/endogenous (sd, sem, 95% ci)
test <- log_rescaling_CI(gene_level_ratios, measurevar="EndovsSeq", groupvar=c("GraphTube"))
# Plot ERCC/endo in log10 scale
spikes2 <- ggplot(test, aes(x=GraphTube, y=10^measurevar_log_resc)) + 
  #geom_bar(position=position_dodge(), stat="identity")+
  geom_errorbar(aes(ymin=ci_lower_oriscale, ymax=ci_upper_oriscale), colour="grey", width=.1) +
  geom_point(aes(colour=Tube), size=1.2) +
  geom_point(data=test, aes(x=GraphTube, y=mean_oriscale), shape=4, colour="grey") +
  geom_hline(yintercept = 1, linetype="dashed",colour="grey88") +
  labs(x="", y="relative Endo/Sequin", title="endogenous vs Sequin counts (rescaled)", col = "") +
  scale_colour_manual(values=color_panel) +
  scale_y_log10() +
  theme_bar 

p1 <- ggplot(test,aes(x=TimeLapse,y=10^measurevar_log_resc, col=BiologicalReplicate, group = BiologicalReplicate))+
  geom_point()+geom_line()+
  theme_point+
  facet_wrap(~Tube, ncol = 5, scales="free_x") +
  theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
  scale_y_continuous(limits = c(0, NA))+
  scale_color_manual(values=color_panel) +
  theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) + 
  scale_y_log10() +
  labs(x="", y="relative Endogenous/Sequin", title="endogenous counts vs Sequin (rescaled to lowest)", col = "")

ggplotly(p1)

6.4.2 Fold changes of relative RNA concentration in plasma

In this metric, we can observe that the RNA concentration in the plasma is much more variable in the preservation tubes, while the fold changes in the non-preservation tubes are very close to 1.

6.4.3 Relative RNA concentration in eluate

The ratio of endogenous RNA to ERCC is inverse proportional to the concentration of endogenous RNA in the eluate. More endogenous RNA in eluate after RNA isolation results in proportionally fewer reads going to ERCC spikes, and thus a higher ratio of endogenous/ERCC.

#calculate statistics for endogenous/ERCC (sd, sem, 95% ci)
test <- log_rescaling_CI(gene_level_ratios, measurevar="EndovsERCC", groupvar=c("GraphTube"))
# Plot ERCC/endo in log10 scale
spikes_conc <- ggplot(test, aes(x=GraphTube, y=10^measurevar_log_resc)) + 
  #geom_bar(position=position_dodge(), stat="identity")+
  geom_errorbar(aes(ymin=ci_lower_oriscale, ymax=ci_upper_oriscale), colour="grey", width=.1) +
  geom_point(aes(colour=Tube), size=1.2) +
  geom_point(data=test, aes(x=GraphTube, y=mean_oriscale), shape=4, colour="grey") +
  geom_hline(yintercept = 1, linetype="dashed",colour="grey88") +
  labs(x="", y="Relative endogenous RNA concentration", title="RNA concentration", subtitle="Endogenous RNA vs ERCC (rescaled)") +
  scale_colour_manual(values=color_panel) +
  scale_y_log10() +
  theme_bar 
#ggplotly(spikes_conc)

tmp_summary <- gene_level_ratios %>% group_by(GraphTube) %>% dplyr::summarize(mean_EndovsERCC= mean(EndovsERCC), sd_EndovsERCC=sd(EndovsERCC)) %>% mutate(CV_EndovsERCC = sd_EndovsERCC/mean_EndovsERCC*100) %>% left_join(unique(dplyr::select(sample_annotation, c("Tube","GraphTube"))), by="GraphTube")

p1 <- ggplot(gene_level_ratios, aes(x = EndovsERCC, y = EndovsSeq, col = Tube)) + 
  geom_point()+geom_smooth(method = "lm", se = TRUE, aes(col = FALSE))+
  theme_point + geom_abline(intercept=0, slope=1, color = "gray", linetype = "dashed")+
  theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
  scale_color_manual(values=color_panel) +
  scale_y_continuous(trans = "log2") + scale_x_continuous(trans = "log2") +
  theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) + labs(x = "endogenous/ERCC", y = "endogenous/Sequin", title = "correlation of rel. RNA concentration in plasma (Sequin) and eluate (ERCC)", col = NULL) + 
  stat_cor(method = "spearman", aes(col = FALSE, label=paste(..r.label.., cut(..p..,breaks = c(-Inf, 0.0001, 0.001, 0.01, 0.05, Inf), labels = c("'****'", "'***'", "'**'", "'*'", "'ns'")), sep = "~")))
p1

7 Cut-off

In order to remove genes with very few counts (and thus to improve repeatability between two replicates) we set a count cut-off at 6 counts. We based this cut-off on the results from exRNAQC004 (the RNA extraction kit study), where we had technical replicates for the combination of miRNeasy serum/plasma and EDTA tubes. Only the protein-coding genes that reach the cut-off are retained.

7.1 Number of genes

ensembl <- useEnsembl(biomart="ensembl",dataset="hsapiens_gene_ensembl", version = 91)
genes_ens <- getBM(attributes=c('ensembl_gene_id','gene_biotype'),mart=ensembl)
#genes_ens <- data.table::fread(paste0(data_path,"gene_biotypes_ensemblv91.txt"), header=T, data.table = F)

kallisto_genes_long <- kallisto_genes %>% gather(., key="UniqueID", value="est_counts", -ensembl_gene_id) %>% #long format
  left_join(., dplyr::select(sample_annotation,c(UniqueID,GraphTube,SampleID, Tube, BiologicalReplicate, TimeLapse)), by="UniqueID") %>% #add kit column
  left_join(., genes_ens, by="ensembl_gene_id") #add gene biotype

#keep only protein coding genes with more than 0 counts
kallisto_genes_long <- kallisto_genes_long %>% filter(est_counts > 0) %>% 
  filter(gene_biotype=="protein_coding")


number_genes_detected <- kallisto_genes_long %>% group_by(SampleID) %>% dplyr::summarize(genes_above0=n()) 
number_genes_detected <- full_join(number_genes_detected, 
                                   kallisto_genes_long %>% group_by(SampleID) %>%
                                     dplyr::summarize(total_est_counts_above0=sum(est_counts)), 
                                   by="SampleID")


#cutoff_kit <- single_pos %>% group_by(GraphTube) %>% dplyr::summarise(median_th = median(filter_threshold))

kallisto_genes_cutoff <- kallisto_genes %>% gather(., key="UniqueID", value="est_counts", -ensembl_gene_id) %>% #long format
  left_join(., dplyr::select(sample_annotation,c(UniqueID,GraphTube,SampleID, Tube, BiologicalReplicate, TimeLapse)), by="UniqueID") %>% #add kit column
  left_join(., genes_ens, by="ensembl_gene_id") #%>% #add gene biotype
  #left_join(., cutoff_kit, by="GraphTube") #add the median cut-off for each kit

kallisto_genes_cutoff <- kallisto_genes_cutoff %>% 
  filter(est_counts > 6) %>% 
  filter(gene_biotype=="protein_coding")

number_genes_detected <- full_join(number_genes_detected, 
                                   kallisto_genes_cutoff %>% group_by(SampleID) %>%
                                     dplyr::summarize(genes_aboveTh=n()),
                                   by="SampleID")
number_genes_detected <- full_join(number_genes_detected, 
                                   kallisto_genes_cutoff %>% group_by(SampleID) %>%
                                     dplyr::summarize(total_est_counts_aboveTh=sum(est_counts)), 
                                   by="SampleID")

number_genes_detected <- left_join(number_genes_detected, 
                                   dplyr::select(sample_annotation, c(UniqueID, GraphTube, RNAisolation, EluateV,SampleID, Tube, BiologicalReplicate, TimeLapse)),
                                   by="SampleID")
#convert to the original format
kallisto_genes_cutoff <- dplyr::select(kallisto_genes_cutoff, ensembl_gene_id, UniqueID, est_counts, Tube, BiologicalReplicate, TimeLapse) %>% 
  spread(., key=UniqueID, value=est_counts)

#write.csv(kallisto_genes_cutoff, file="../../../exRNAQC/kallisto_genes_cutoff.csv",row.names=FALSE, na="")

There is a substantial drop in number of genes after applying the cut-off. For example, the RNA Streck tube only has 300-500 genes left.

8 Area left of the curve (ALC)

  • The ALC (area-left-of-curve) calculation is done according to (see Mestdagh et al., Nature Methods, 2014) and represents a precision metric:
    1. Only genes that reach threshold (95% single positive (SP) elimination based on exRNAQC004) are taken into account.
    2. All zero counts are replaced by NA
    3. The log2 ratios of counts for each gene are determined, each time between 2 timepoints
    4. Then, the absolute value of these log2 ratios are taken and ranked. This is plotted for every condition
  • The faster the curve reaches 100% -> the smaller the ALC -> the better (indicates that the replicates give similar counts for each gene)

The ALC values are summarized based on the average of the ALC values (after reloving NAs). The dot plot at the end gives an overview of all these values.

Score: lower ALC = better

8.1 ALC (no 0’s and 1 > cut-off) summary

#print(all_plot + labs(title="Pairwise ALCs"))
 
ALC_melt <- left_join(ALC, sample_annotation[,c("Tube")], by="Tube")
ALC$BiologicalReplicate <- gsub("[-]TUBE[0-9]*","", ALC$BiologicalReplicate)
ALC$Replicates <- gsub("^Rep04_0$","T0_04", ALC$Replicates)
ALC$Replicates <- gsub("^Rep0_04$","T0_04", ALC$Replicates)
ALC$Replicates <- gsub("^Rep16_0$","T0_16", ALC$Replicates)
ALC$Replicates <- gsub("^Rep0_16$","T0_16", ALC$Replicates)
ALC$Replicates <- gsub("^Rep16_04$","T04_16", ALC$Replicates)
ALC$Replicates <- gsub("^Rep04_16$","T04_16", ALC$Replicates)
ALC$Replicates <- gsub("^Rep24_0$","T0_24", ALC$Replicates)
ALC$Replicates <- gsub("^Rep0_24$","T0_24", ALC$Replicates)
ALC$Replicates <- gsub("^Rep72_0$","T0_72", ALC$Replicates)
ALC$Replicates <- gsub("^Rep0_72$","T0_72", ALC$Replicates)
ALC$Replicates <- gsub("^Rep72_24$","T24_72", ALC$Replicates)
ALC$Replicates <- gsub("^Rep24_72$","T24_72", ALC$Replicates)
#ALC_melt <- ALC_melt %>% filter(Replicates %in% c("Rep0_04", "Rep04_16", "Rep0_16", "Rep0_24", "Rep24_72", "Rep0_72"))

ALC$TimePoint <- ifelse(ALC$Replicates %in% c("T24_0", "T0_24", "T0_04", "T04_0"), "1st timepoint", ifelse(ALC$Replicates%in% c("T72_0", "T0_72", "T0_16", "T16_0"), "2nd timepoint", NA))
ALC$TubeType <- ifelse(ALC$Tube %in% c("DNA Streck", "Biomatrica", "RNA Streck", "PAXgene", "Roche"), "Preservation", "Non-preservation")

p1 <- ggplot(ALC %>% filter(!is.na(TimePoint)), aes(x=reorder(Tube,ALC_calc, FUN = function(x) -mean(x, na.rm = TRUE)), y = 2^ALC_calc)) + geom_boxplot() + geom_beeswarm(groupOnX=TRUE, size = 2.5, aes(col = TimePoint, shape = TubeType)) + geom_hline(yintercept = 1, linetype = "dashed", color = "gray") + labs(subtitle = "ordered by mean", title = "pairwise ALCs (both > 0, at least 1 > cut-off)", y = "linear ALC", x = "tube", col = "comparison", shape = "tube type") +
  scale_fill_manual(values=color_panel) + theme_point + theme(axis.text.x = element_text(angle = 90)) +
  stat_summary(
    geom = "point",
    fun.y = "mean",
    col = "black",
    size = 3,
    shape = 24,
    fill = "white"
  )

p1

9 Gene biotype

For every sample, we have plotted the percentage of counts and genes mapping to which biotypes. We expect that almost all genes that are detected are protein coding genes (as we apply an RNA exome capture-based method). Due to non-specific hybrid capture, also non-protein coding genes may be enriched.

genes_ens <- getBM(attributes=c('ensembl_gene_id','hgnc_symbol','gene_biotype','chromosome_name'),mart=ensembl)

genes_ens$gene_biotype[grep("protein coding",genes_ens$gene_biotype)]<-"protein coding"
genes_ens$gene_biotype[grep("pseudogene",genes_ens$gene_biotype)]<-"pseudogene"
genes_ens$gene_biotype[grep("TR",genes_ens$gene_biotype)]<-"other"
genes_ens$gene_biotype[grep("IG",genes_ens$gene_biotype)]<-"other"
genes_ens$gene_biotype[grep("TEC",genes_ens$gene_biotype)]<-"other"
genes_ens$gene_biotype[grep("^sc",genes_ens$gene_biotype)]<-"other"
genes_ens$gene_biotype[grep("ribozyme",genes_ens$gene_biotype)]<-"other"
genes_ens$gene_biotype[grep("miRNA",genes_ens$gene_biotype)]<-"other"
genes_ens$gene_biotype[grep("vault",genes_ens$gene_biotype)]<-"other"
genes_ens$gene_biotype[grep("antisense_RNA",genes_ens$gene_biotype)]<-"lncRNA"
genes_ens$gene_biotype[grep("lincRNA",genes_ens$gene_biotype)]<-"lncRNA"
genes_ens$gene_biotype[grep("lncRNA",genes_ens$gene_biotype)]<-"lncRNA"
genes_ens$gene_biotype[grep("overlapping",genes_ens$gene_biotype)]<-"lncRNA"
genes_ens$gene_biotype[grep("^sense",genes_ens$gene_biotype)]<-"lncRNA"
genes_ens$gene_biotype[grep("non_coding",genes_ens$gene_biotype)]<-"lncRNA"
genes_ens$gene_biotype[grep("processed_transcript",genes_ens$gene_biotype)]<-"lncRNA"
genes_ens$gene_biotype[grep("^s.*RNA$",genes_ens$gene_biotype)]<-"s(no)RNA"
genes_ens$gene_biotype[grep("MT",genes_ens$chromosome_name)]<-"MT_gene"
genes_ens$gene_biotype[grep("RNR",genes_ens$hgnc_symbol)]<-"MT_RNRgene"

9.1 Biotype composition based on TPM

If we look at the individual samples, we notice that there is a lot more variability in terms of composition in the preservation tubes, compared to the non-preservation tubes.

9.2 Fold changes of protein coding gene count fraction

10 Fold change summary of 5 performance metrics

We can conclude, based on this data-analysis and these metrics, that the preservation tubes are too variable in terms of fold-changes to be of further use. They do not claim what they do: preservation of the content based on T0. The non-preservation tubes were not tested to T72, but these are also not marketed to do that. We chose 16 hours as latest realistic timepoint in a routine lab.

For phase II of the exRNAQC study, we selected three tubes to proceed, i.e. EDTA, citrate and serum, based on their performance and availability in a routine hospital setting.

figure_L <- ggarrange(
  hemolysis_lineplot + labs(title = "hemolysis in PFP", col = NULL, x = ""), 
  RNAconc_lineplot + labs(title = "ratio endogenous counts vs Sequin spike-in", col = NULL, y = "endogenous/Sequin"), 
  geneCount_lineplot + labs(title = "absolute number of protein coding genes", col = NULL, y = "number of protein coding genes"), 
  biotype_lineplot + labs(title = "evolution of protein coding gene count fraction", col = NULL, y = "protein coding counts / total counts * 100"), 
  labels = c("A", "B", "C", "D", "E"),
  common.legend = TRUE, legend = "bottom", ncol = 2, nrow = 2
  )
ggsave(figure_L, filename = "./plots/line_individ_overview.png", dpi = 300, height=12, width=12)
ggsave(figure_L, filename = "./plots/line_individ_overview.pdf", dpi = 300, height=12, width=12)
ggsave(figure_L, filename = "./plots/line_individ_overview.svg", dpi = 300, height=12, width=12)


figure_R <- ggarrange(
  hemolysis_FC_plot + labs(subtitle = NULL), 
  RNAconc_FC_plot + labs(subtitle = NULL), 
  geneCount_FC_plot + labs(subtitle = NULL), 
  ALC_FC_plot + 
        labs(subtitle = NULL, title = "area left of the curve", x = "tube") +
        theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1)), 
  biotype_FC_plot + 
        labs(subtitle = NULL, title = "fold change of protein coding gene count percentage"), labels = c("A", "B", "C", "D", "E"),
  common.legend = TRUE, legend = "bottom", ncol = 2, nrow = 3
  )
ggsave(figure_R, filename = "./plots/FC_individ_overview.png", dpi = 300, height=12, width=9)
ggsave(figure_R, filename = "./plots/FC_individ_overview.pdf", dpi = 300, height=12, width=9)
ggsave(figure_R, filename = "./plots/FC_individ_overview.svg", dpi = 300, height=12, width=9)

11 Sessioninfo

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## Random number generation:
##  RNG:     Mersenne-Twister 
##  Normal:  Inversion 
##  Sample:  Rounding 
##  
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] gdtools_0.2.2      plyr_1.8.6         DT_0.16            ggpubr_0.4.0      
##  [5] nnls_1.4           plotly_4.9.2.1     RCurl_1.98-1.2     ggExtra_0.9       
##  [9] gridExtra_2.3      ggbeeswarm_0.6.0   ggrepel_0.8.2      scales_1.1.1      
## [13] RColorBrewer_1.1-2 broom_0.7.2        reshape2_1.4.4     biomaRt_2.46.0    
## [17] forcats_0.5.0      stringr_1.4.0      dplyr_1.0.2        purrr_0.3.4       
## [21] readr_1.4.0        tidyr_1.1.2        tibble_3.0.4       ggplot2_3.3.2     
## [25] tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1     ggsignif_0.6.0       ellipsis_0.3.1      
##   [4] rio_0.5.16           fs_1.5.0             rstudioapi_0.11     
##   [7] farver_2.0.3         bit64_4.0.5          AnnotationDbi_1.52.0
##  [10] fansi_0.4.1          lubridate_1.7.9      xml2_1.3.2          
##  [13] splines_4.0.3        knitr_1.30           jsonlite_1.7.1      
##  [16] Cairo_1.5-12.2       dbplyr_1.4.4         shiny_1.5.0         
##  [19] compiler_4.0.3       httr_1.4.2           backports_1.1.10    
##  [22] Matrix_1.2-18        assertthat_0.2.1     fastmap_1.0.1       
##  [25] lazyeval_0.2.2       cli_2.1.0            later_1.1.0.1       
##  [28] htmltools_0.5.0      prettyunits_1.1.1    tools_4.0.3         
##  [31] gtable_0.3.0         glue_1.4.2           rappdirs_0.3.1      
##  [34] Rcpp_1.0.5           carData_3.0-4        Biobase_2.50.0      
##  [37] cellranger_1.1.0     vctrs_0.3.4          svglite_1.2.3.2     
##  [40] nlme_3.1-149         crosstalk_1.1.0.1    xfun_0.19           
##  [43] openxlsx_4.2.3       rvest_0.3.6          mime_0.9            
##  [46] miniUI_0.1.1.1       lifecycle_0.2.0      rstatix_0.6.0       
##  [49] XML_3.99-0.5         hms_0.5.3            promises_1.1.1      
##  [52] parallel_4.0.3       yaml_2.2.1           curl_4.3            
##  [55] memoise_1.1.0        stringi_1.5.3        RSQLite_2.2.1       
##  [58] S4Vectors_0.28.0     BiocGenerics_0.36.0  zip_2.1.1           
##  [61] systemfonts_0.3.2    rlang_0.4.8          pkgconfig_2.0.3     
##  [64] bitops_1.0-6         lattice_0.20-41      evaluate_0.14       
##  [67] htmlwidgets_1.5.2    labeling_0.4.2       cowplot_1.1.0       
##  [70] bit_4.0.4            tidyselect_1.1.0     magrittr_1.5        
##  [73] R6_2.5.0             IRanges_2.24.0       generics_0.1.0      
##  [76] DBI_1.1.0            mgcv_1.8-33          pillar_1.4.6        
##  [79] haven_2.3.1          foreign_0.8-80       withr_2.3.0         
##  [82] abind_1.4-5          modelr_0.1.8         crayon_1.3.4        
##  [85] car_3.0-10           BiocFileCache_1.14.0 rmarkdown_2.5       
##  [88] progress_1.2.2       grid_4.0.3           readxl_1.3.1        
##  [91] data.table_1.13.2    blob_1.2.1           reprex_0.3.0        
##  [94] digest_0.6.27        xtable_1.8-4         httpuv_1.5.4        
##  [97] openssl_1.4.3        stats4_4.0.3         munsell_0.5.0       
## [100] beeswarm_0.2.3       viridisLite_0.3.0    vipor_0.4.5         
## [103] askpass_1.1